#Dennis Moskov, Master Thesis
#subsample data base by different criteria

# Working copy of the master data base; the subsets below are taken from DB.
DB <- master


# Split by reduction time (red.t) at 4 h:
#   DBlow  - rows with red.t <= 4
#   DBhigh - rows with red.t >  4
# In each subset, columns whose entries are all zero are removed.
DBlow <- local({
  rows <- DB[DB$red.t <= 4, ]
  rows[, colSums(rows != 0) > 0]
})
DBhigh <- local({
  rows <- DB[DB$red.t > 4, ]
  rows[, colSums(rows != 0) > 0]
})

# Split by preparation method:
#   DB1 - rows with prep.meth == 1
#   DB0 - all other rows (prep.meth != 1)
# In each subset, columns whose entries are all zero are removed.
DB1 <- local({
  rows <- DB[DB$prep.meth == 1, ]
  rows <- rows[, colSums(rows != 0) > 0]
  # prep.meth is 1 for every row of this subset, so the column is dropped
  rows[, names(rows) != "prep.meth"]
})
DB0 <- local({
  rows <- DB[DB$prep.meth != 1, ]
  rows[, colSums(rows != 0) > 0]
})

# Keep only rows whose GHSV (rxn.GHSV) lies in the closed range 1200 - 6000,
# then remove columns whose entries are all zero.
DBghsv <- local({
  rows <- DB[DB$rxn.GHSV >= 1200 & DB$rxn.GHSV <= 6000, ]
  rows[, colSums(rows != 0) > 0]
})

# Split by reduction temperature (red.T) at 555 K:
#   DBlow  - rows with red.T <= 555
#   DBhigh - rows with red.T >  555
# In each subset, columns whose entries are all zero are removed.
# Note: this reuses (and therefore overwrites) the names DBlow / DBhigh.
DBlow <- local({
  rows <- DB[DB$red.T <= 555, ]
  rows[, colSums(rows != 0) > 0]
})
DBhigh <- local({
  rows <- DB[DB$red.T > 555, ]
  rows[, colSums(rows != 0) > 0]
})

# Split by carbon oxide content (columns comp.CO and comp.CO2):
#   DBco2 - rows without CO           (comp.CO == 0)
#   DBcox - rows with both CO and CO2 (comp.CO > 0 and comp.CO2 > 0)
#   DBco  - rows with CO and CO2 below 7.5 (comp.CO > 0 and comp.CO2 < 7.5)
# In each subset, columns whose entries are all zero are removed.
DBco2 <- local({
  rows <- DB[DB$comp.CO == 0, ]
  rows[, colSums(rows != 0) > 0]
})
DBcox <- local({
  rows <- DB[DB$comp.CO > 0 & DB$comp.CO2 > 0, ]
  rows[, colSums(rows != 0) > 0]
})
DBco <- local({
  rows <- DB[DB$comp.CO > 0 & DB$comp.CO2 < 7.5, ]
  rows[, colSums(rows != 0) > 0]
})

# By commercial catalyst (Cu/ZnO/Al2O3): keep rows in which at least one of
# CuO, ZnO or Al2O3 is greater than zero, then remove all-zero columns.
# NOTE(review): the original heading said ">= 75%", but the condition only
# tests "> 0" for any of the three components - confirm the intended rule.
DBcom <- local({
  rows <- DB[DB$CuO > 0 | DB$ZnO > 0 | DB$Al2O3 > 0, ]
  rows[, colSums(rows != 0) > 0]
})


# by PAMK 2 cluster
#   DB1 - rows with clustering == 1
#   DB2 - rows with clustering == 2
# In each subset, columns whose entries are all zero are removed and the
# second remaining column is dropped.
# NOTE(review): column 2 is presumably the cluster label - confirm.
DB1 <- DB[DB$clustering == 1, ]
DB1 <- DB1[, colSums(DB1 != 0) > 0]
DB1 <- DB1[-2]
DB2 <- DB[DB$clustering == 2, ]
# The all-zero column mask must be computed on DB2 itself. Using DB1 here
# (already reduced and with a different number of columns) selected the
# wrong columns or failed with "undefined columns selected".
DB2 <- DB2[, colSums(DB2 != 0) > 0]
DB2 <- DB2[-2]

# by hierarchical 2 cluster
#   DB1 - rows with hClusdendo == 1
#   DB2 - rows with hClusdendo == 2
# In each subset, columns whose entries are all zero are removed and the
# second remaining column is dropped.
# NOTE(review): column 2 is presumably the cluster label - confirm.
DB1 <- DB[DB$hClusdendo == 1, ]
DB1 <- DB1[, colSums(DB1 != 0) > 0]
DB1 <- DB1[-2]
DB2 <- DB[DB$hClusdendo == 2, ]
# The all-zero column mask must be computed on DB2 itself. Using DB1 here
# (already reduced and with a different number of columns) selected the
# wrong columns or failed with "undefined columns selected".
DB2 <- DB2[, colSums(DB2 != 0) > 0]
DB2 <- DB2[-2]

# by random forest 2 cluster
# Replaces DB itself with the rows of cluster 1, removes columns whose
# entries are all zero and drops the second remaining column.
# NOTE(review): this filters on the `clustering` column - confirm that it
# holds the random forest cluster labels when this section is run.
DB <- local({
  rows <- DB[DB$clustering == 1, ]
  rows <- rows[, colSums(rows != 0) > 0]
  rows[-2]
})




# Standardize only the numerical input variables.
# Columns that must not be scaled are listed by name: the categorical /
# identifier columns (prep.meth, article) and the responses (respvar).
# When adding such columns to the data base, extend these lists accordingly.
respvar <- c("X.MeOH", "S.MeOH", "Y.MeOH")
scalevar <- setdiff(names(DB), c("prep.meth", "article", respvar))

# Scale in place on a copy of DB: column order and column names are kept
# without rebuilding the data frame, and DB[scalevar] stays a data frame
# even if only a single column is left to scale.
DBs <- DB
DBs[scalevar] <- lapply(DB[scalevar], function(col) as.numeric(scale(col)))
# Reset row names to 1..n, as the data.frame() rebuild used here before did.
rownames(DBs) <- NULL

# Encode responses to fractions. The response columns are addressed by name
# so the conversion does not depend on them being the last three columns.
DBs[respvar] <- DBs[respvar] / 100

DB <- DBs






